[FRAUD] df02 accuracy 0.9707 f1은 망함

Author

김보람

Published

September 5, 2023

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# gnn
import torch
import torch_geometric
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_geometric/typing.py:18: UserWarning: An issue occurred while importing 'pyg-lib'. Disabling its usage. Stacktrace: /home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/libpyg.so: undefined symbol: _ZN2at4_ops12split_Tensor4callERKNS_6TensorEN3c106SymIntEl
  warnings.warn(f"An issue occurred while importing 'pyg-lib'. "
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_geometric/typing.py:31: UserWarning: An issue occurred while importing 'torch-scatter'. Disabling its usage. Stacktrace: /home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_scatter/_scatter_cuda.so: undefined symbol: _ZNK3c107SymBool10guard_boolEPKcl
  warnings.warn(f"An issue occurred while importing 'torch-scatter'. "
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_geometric/typing.py:42: UserWarning: An issue occurred while importing 'torch-sparse'. Disabling its usage. Stacktrace: /home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_sparse/_diag_cuda.so: undefined symbol: _ZN3c106detail19maybe_wrap_dim_slowIlEET_S2_S2_b
  warnings.warn(f"An issue occurred while importing 'torch-sparse'. "

- fraudTrain

fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time= list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns

- df02

_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1,_df2])
df02 = df02.reset_index()

- df_toy

df_toy=df02[:5].copy()
df_toy.cc_num = pd.Series([1,1,1,2,2])
df_toy
index trans_date_trans_time cc_num merchant category amt first last gender street ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 669418 2019-10-12 18:21:00 1 fraud_Haley, Jewess and Bechtelar shopping_pos 7.53 Debra Stark F 686 Linda Rest ... 32.3836 -94.8653 24536 Multimedia programmer 1983-10-14 d313353fa30233e5fab5468e852d22fc 1350066071 32.202008 -94.371865 0
1 32567 2019-01-20 13:06:00 1 fraud_Turner LLC travel 3.79 Judith Moss F 46297 Benjamin Plains Suite 703 ... 39.5370 -83.4550 22305 Television floor manager 1939-03-09 88c65b4e1585934d578511e627fe3589 1327064760 39.156673 -82.930503 0
2 156587 2019-03-24 18:09:00 1 fraud_Klein Group entertainment 59.07 Debbie Payne F 204 Ashley Neck Apt. 169 ... 41.5224 -71.9934 4720 Broadcast presenter 1977-05-18 3bd9ede04b5c093143d5e5292940b670 1332612553 41.657152 -72.595751 0
3 1020243 2020-02-25 15:12:00 2 fraud_Monahan-Morar personal_care 25.58 Alan Parsons M 0547 Russell Ford Suite 574 ... 39.6171 -102.4776 207 Network engineer 1955-12-04 19e16ee7a01d229e750359098365e321 1361805120 39.080346 -103.213452 0
4 116272 2019-03-06 23:19:00 2 fraud_Kozey-Kuhlman personal_care 84.96 Jill Flores F 639 Cruz Islands ... 41.9488 -86.4913 3104 Horticulturist, commercial 1981-03-29 a0c8641ca1f5d6e243ed5a2246e66176 1331075954 42.502065 -86.732664 0

5 rows × 23 columns

- df_toy 에서 time_difference 구함

고객1

df_toy.iloc[0].trans_date_trans_time.value - df_toy.iloc[1].trans_date_trans_time.value
22914900000000000
df_toy.iloc[0].trans_date_trans_time.value - df_toy.iloc[2].trans_date_trans_time.value
17453520000000000
df_toy.iloc[1].trans_date_trans_time.value - df_toy.iloc[2].trans_date_trans_time.value
-5461380000000000

고객2

df_toy.iloc[3].trans_date_trans_time.value - df_toy.iloc[4].trans_date_trans_time.value
30729180000000000

고객1,2

def compute_time_difference(group):
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
groups = df_toy.groupby('cc_num')
edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
edge_index_list_plus_nparr
array([[                0,                 0,                 0],
       [                0,                 1, 22914900000000000],
       [                0,                 2, 17453520000000000],
       [                1,                 0, 22914900000000000],
       [                1,                 1,                 0],
       [                1,                 2,  5461380000000000],
       [                2,                 0, 17453520000000000],
       [                2,                 1,  5461380000000000],
       [                2,                 2,                 0],
       [                3,                 3,                 0],
       [                3,                 4, 30729180000000000],
       [                4,                 3, 30729180000000000],
       [                4,                 4,                 0]])

- df02에서 time_difference 구함

# t1 = time.time()
# groups = df02.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus02.npy', edge_index_list_plus_nparr)
# t2 = time.time()
# t2-t1
groups = df02.groupby("cc_num")
edge_index_list_plus02[:,2] = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1)*(np.exp(-edge_index_list_plus02[:,2]/theta))
edge_index_list_plus02
array([[  2881,   2881,      0],
       [  2881,   3061,      0],
       [  2881,   4867,      0],
       ...,
       [212771, 212765,      0],
       [212771, 212769,      0],
       [212771, 212771,      0]])
edge_index_list_plus02 = np.load('edge_index_list_plus02.npy').astype(np.float64)
theta = edge_index_list_plus02[:,2].mean()
edge_index_list_plus02[:,2] = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1)*(np.exp(-edge_index_list_plus02[:,2]/theta))
edge_index_list_plus02
array([[2.88100000e+03, 2.88100000e+03, 0.00000000e+00],
       [2.88100000e+03, 3.06100000e+03, 1.96061280e-01],
       [2.88100000e+03, 4.86700000e+03, 8.12918172e-01],
       ...,
       [2.12771000e+05, 2.12765000e+05, 9.97708695e-01],
       [2.12771000e+05, 2.12769000e+05, 9.99923197e-01],
       [2.12771000e+05, 2.12771000e+05, 0.00000000e+00]])
edge_index_list_plus02.shape
(65831594, 3)
edge_index_list_updated = edge_index_list_plus02.tolist()
np.array(edge_index_list_updated)[:,2].mean()
0.4536043999922591
mm = np.array(edge_index_list_updated)[:,2].mean()
selected_edges = [(int(row[0]), int(row[1])) for row in edge_index_list_updated if row[2] > mm]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
torch.Size([2, 29970380])

tr/test

df02_tr,df02_test = sklearn.model_selection.train_test_split(df02, random_state=42)
df02_tr.shape, df02_test.shape
((160890, 23), (53630, 23))
N = len(df02)
train_mask = [i in df02_tr.index for i in range(N)]
test_mask = [i in df02_test.index for i in range(N)]
train_mask = np.array(train_mask)
test_mask = np.array(test_mask)
train_mask.shape, test_mask.shape
((214520,), (214520,))

data

x = torch.tensor(df02['amt'], dtype=torch.float).reshape(-1,1)
y = torch.tensor(df02['is_fraud'],dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index = edge_index_selected, y=y, train_mask = train_mask, test_mask = test_mask)
data
Data(x=[214520, 1], edge_index=[2, 29970380], y=[214520], train_mask=[214520], test_mask=[214520])
torch.manual_seed(202250926)
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 32)
        self.conv2 = GCNConv(32,2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
    

X = (data.x[data.train_mask]).numpy()
XX = (data.x[data.test_mask]).numpy()
y = (data.y[data.train_mask]).numpy()
yy = (data.y[data.test_mask]).numpy()


model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    
    
    model.eval()

pred = model(data).argmax(dim=1)
yyhat = pred[data.test_mask]

metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]

_results2= pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics},index=['분석2'])
_results2
accuracy_score precision_score recall_score f1_score
분석2 0.973112 0.72695 0.130573 0.221382
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(np.array(data.test_mask).sum())
print(f'Accuracy: {acc:.4f}')
Accuracy: 0.9707
predicted_labels = pred[data.test_mask]
true_labels = data.y[data.test_mask]
precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
Precision: 0.4854
Recall: 0.5000
F1 Score: 0.4926
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))